In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
df = pd.read_csv('Classified Data',index_col=0)
# set index_col=0 to use the first column of the file as the index
In [3]:
df.head()
Out[3]:
[first five rows of the dataframe]
Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.
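A quick numeric sketch (with made-up values) of why this matters: if one feature is on the order of thousands and another on the order of single digits, the large-scale feature dominates the Euclidean distance almost entirely.

# Hypothetical points: feature 1 is on the scale of 1000, feature 2 on the scale of 1.
a = np.array([1000.0, 1.0])
b = np.array([1100.0, 5.0])
print(np.sqrt(((a - b) ** 2).sum()))  # ~100.08, almost all of it from feature 1

Standardizing the features first puts them on equal footing, which is exactly what StandardScaler does below.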
In [16]:
from sklearn.preprocessing import StandardScaler
In [17]:
scaler = StandardScaler()
In [18]:
scaler.fit(df.drop('TARGET CLASS',axis=1))
Out[18]:
[the fitted StandardScaler]
In [19]:
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))
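fit() learns each column's mean and standard deviation from the data; transform() then standardizes every feature to z = (x - mean) / std. An optional sanity check confirms that each scaled column now has mean ~0 and standard deviation ~1:

print(scaled_features.mean(axis=0).round(2))  # all ~0
print(scaled_features.std(axis=0).round(2))   # all ~1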
In [20]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat
Out[20]:
[the dataframe of standardized features]
In [21]:
from sklearn.model_selection import train_test_split
In [22]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['TARGET CLASS'],
                                                    test_size=0.30)
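Note that no random_state is passed here, so the 70/30 split, and therefore every score reported below, will vary slightly from run to run; passing an arbitrary seed such as random_state=101 would make the results reproducible.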
In [23]:
from sklearn.neighbors import KNeighborsClassifier
In [24]:
knn = KNeighborsClassifier(n_neighbors=1)
In [25]:
knn.fit(X_train,y_train)
Out[25]:
[the fitted KNeighborsClassifier]
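With n_neighbors=1 the model simply copies the class of the single nearest training point, so it has very low bias but high variance and is prone to overfitting; we start here as a baseline.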
In [27]:
pred = knn.predict(X_test)
In [28]:
from sklearn.metrics import classification_report,confusion_matrix
In [29]:
print(confusion_matrix(y_test,pred))
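In scikit-learn's confusion matrix, rows are the true labels and columns the predictions, so the diagonal holds the correctly classified counts and the off-diagonal entries the errors.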
In [30]:
print(classification_report(y_test,pred))
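Rather than sticking with the arbitrary K=1, the elbow method below retrains the classifier for each K from 1 to 29, records the mean misclassification rate on the test set, and looks for the point where the error curve flattens out.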
In [31]:
error_rate = []

# Refit the model for each K from 1 to 29 and record the
# misclassification rate on the test set.
for i in range(1,30):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # mean of a boolean array = fraction of wrong predictions
    error_rate.append(np.mean(pred_i != y_test))
In [38]:
plt.figure(figsize=(10,6))
plt.plot(range(1,30), error_rate, color='blue', linestyle='-', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Out[38]:
[line plot of Error Rate vs. K Value]
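From a plot like this, one picks a K at which the error rate has settled at a low value; the K=16 used below is presumably such a point read off the curve.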
In [39]:
# FIRST A QUICK COMPARISON TO OUR ORIGINAL K=1
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=1')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
In [40]:
# NOW WITH K=16
knn = KNeighborsClassifier(n_neighbors=16)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=16')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
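As a side note, judging K on a single train/test split can be noisy; cross-validation averages the score over several splits. A minimal sketch (not part of the original notebook, assuming scaled_features and df are defined as above) using scikit-learn's cross_val_score:

from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy for each candidate K.
cv_scores = []
for k in range(1, 30):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             scaled_features, df['TARGET CLASS'], cv=5)
    cv_scores.append(scores.mean())

# K with the highest mean accuracy (index 0 corresponds to K=1)
print(int(np.argmax(cv_scores)) + 1)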